In [1]:
pip install fastcluster
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: fastcluster in /home/msds2024/jramoso/.local/lib/python3.10/site-packages (1.2.6) Requirement already satisfied: numpy>=1.9 in /opt/conda/lib/python3.10/site-packages (from fastcluster) (1.24.3) Note: you may need to restart the kernel to use updated packages.
In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (calinski_harabasz_score,
davies_bouldin_score,
silhouette_score,)
from sklearn.base import clone
from scipy.spatial.distance import euclidean, cityblock
import scipy.spatial.distance as distance
import sqlite3
import warnings
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from scipy.cluster.hierarchy import dendrogram, linkage
import fastcluster
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import OPTICS, cluster_optics_dbscan
warnings.filterwarnings("ignore")
A. Exploratory Data Analysis
In [3]:
def plot_scoring_stats(scoring_stats, title):
    """
    Draw one box plot per scoring attribute in a single wide figure.

    Parameters
    ----------
    scoring_stats : pd.DataFrame
        One column per scoring attribute; rows are player ratings.
    title : str
        Figure title.
    """
    # Long-to-wide melt so seaborn can draw one box per column.
    melted = scoring_stats.melt(var_name='Statistic', value_name='Value')
    # Single light-blue color repeated for every box.
    palette = ["#87CEEB"] * len(scoring_stats.columns)
    plt.figure(figsize=(20, 8))
    sns.boxplot(x='Statistic', y='Value', data=melted, palette=palette)
    plt.xticks(rotation=45)  # keep long attribute names readable
    plt.title(title, fontsize=20)
    plt.xlabel('Scoring Attribute', fontsize=14)
    plt.ylabel('Attribute Rating', fontsize=14)
    plt.tight_layout()
    plt.show()
def plot_passing_stats(passing_stats, fig_num):
    """
    Plot the distribution of each passing statistic (crossing, short
    passing, long passing, vision) as a histogram with KDE overlay in
    a 2x2 grid.

    Refactor: the original repeated the same subplot stanza four
    times; a loop over (column, label) pairs produces identical
    figures with a quarter of the code.

    Parameters
    ----------
    passing_stats : pd.DataFrame
        Must contain columns 'crossing', 'short_passing',
        'long_passing', 'vision'.
    fig_num : int or str
        Figure number used in the titles.
    """
    panels = [('crossing', 'Crossing'),
              ('short_passing', 'Short Passing'),
              ('long_passing', 'Long Passing'),
              ('vision', 'Vision')]
    plt.figure(figsize=(18, 12))
    plt.suptitle(
        f"Figure {fig_num}. Passing Statistics", fontsize=20, y=0.97)
    for i, (col, label) in enumerate(panels):
        plt.subplot(2, 2, i + 1)
        sns.histplot(passing_stats[col], kde=True,
                     color='b', edgecolor='black')
        # chr(65 + i) -> 'A', 'B', 'C', 'D' sub-figure letters
        plt.title(f'Figure {fig_num}{chr(65 + i)}: {label} Distribution')
        plt.xlabel('Attribute Rating')
        plt.ylabel('Frequency')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
def plot_off_skillIQ_stats(off_skillIQ_stats, fig_num):
    """
    Plot the distribution of each offensive skill/IQ statistic
    (dribbling, ball control, positioning) as a histogram with KDE
    overlay in a 1x3 grid.

    Refactor: the original repeated the same subplot stanza three
    times; a loop over (column, label) pairs is equivalent.

    Parameters
    ----------
    off_skillIQ_stats : pd.DataFrame
        Must contain columns 'dribbling', 'ball_control',
        'positioning'.
    fig_num : int or str
        Figure number used in the titles.
    """
    panels = [('dribbling', 'Dribbling'),
              ('ball_control', 'Ball Control'),
              ('positioning', 'Positioning')]
    plt.figure(figsize=(18, 8))
    plt.suptitle(
        f"Figure {fig_num}. Offensive Skills/IQ Statistics",
        fontsize=20, y=0.97)
    for i, (col, label) in enumerate(panels):
        plt.subplot(1, 3, i + 1)
        sns.histplot(off_skillIQ_stats[col],
                     kde=True, color='b', edgecolor='black')
        # chr(65 + i) -> 'A', 'B', 'C' sub-figure letters
        plt.title(f'Figure {fig_num}{chr(65 + i)}: {label}')
        plt.xlabel('Attribute Rating')
        plt.ylabel('Frequency')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
def plot_def_stats(def_stats, fig_num):
    """
    Plot the distribution of each defensive statistic (interceptions,
    standing tackle, sliding tackle) as a histogram with KDE overlay
    in a 1x3 grid.

    Refactor: the original repeated the same subplot stanza three
    times; a loop over (column, label) pairs is equivalent.

    Parameters
    ----------
    def_stats : pd.DataFrame
        Must contain columns 'interceptions', 'standing_tackle',
        'sliding_tackle'.
    fig_num : int or str
        Figure number used in the titles.
    """
    panels = [('interceptions', 'Interceptions'),
              ('standing_tackle', 'Standing Tackle'),
              ('sliding_tackle', 'Sliding Tackle')]
    plt.figure(figsize=(18, 8))
    plt.suptitle(
        f"Figure {fig_num}. Defensive Statistics", fontsize=20, y=0.97)
    for i, (col, label) in enumerate(panels):
        plt.subplot(1, 3, i + 1)
        sns.histplot(def_stats[col], kde=True,
                     color='b', edgecolor='black')
        # chr(65 + i) -> 'A', 'B', 'C' sub-figure letters
        plt.title(f'Figure {fig_num}{chr(65 + i)}: {label}')
        plt.xlabel('Attribute Rating')
        plt.ylabel('Frequency')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
def plot_def_skillIQ_stats(def_skillIQ_stats, fig_num):
    """
    Plot the distribution of each defensive skill/IQ statistic
    (aggression, marking) as a histogram with KDE overlay in a 1x2
    grid.

    Refactor: the original repeated the same subplot stanza twice;
    a loop over (column, label) pairs is equivalent.

    Parameters
    ----------
    def_skillIQ_stats : pd.DataFrame
        Must contain columns 'aggression' and 'marking'.
    fig_num : int or str
        Figure number used in the titles.
    """
    panels = [('aggression', 'Aggression'),
              ('marking', 'Marking')]
    plt.figure(figsize=(18, 8))
    plt.suptitle(
        f"Figure {fig_num}. Defensive Skills / IQ Statistics",
        fontsize=20, y=0.97)
    for i, (col, label) in enumerate(panels):
        plt.subplot(1, 2, i + 1)
        sns.histplot(def_skillIQ_stats[col],
                     kde=True, color='b', edgecolor='black')
        # chr(65 + i) -> 'A', 'B' sub-figure letters
        plt.title(f'Figure {fig_num}{chr(65 + i)}: {label}')
        plt.xlabel('Attribute Rating')
        plt.ylabel('Frequency')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
def plot_physical_stats(stats, fig_num):
    """
    Draw one box plot per physical attribute in a single wide figure.

    Parameters
    ----------
    stats : pd.DataFrame
        One column per physical attribute; rows are player ratings.
    fig_num : int or str
        Figure number used in the title.
    """
    # Single light-blue color repeated for every box.
    palette = ["#87CEEB"] * len(stats.columns)
    # Long-to-wide melt so seaborn can draw one box per column.
    melted = stats.melt(var_name='Statistic', value_name='Value')
    plt.figure(figsize=(20, 8))
    sns.boxplot(x='Statistic', y='Value', data=melted, palette=palette)
    plt.xticks(rotation=45)  # keep long attribute names readable
    plt.title(f'Figure {fig_num}. Physical Statistics Box Plots',
              fontsize=20)
    plt.xlabel('Physical Attribute', fontsize=14)
    plt.ylabel('Attribute Rating', fontsize=14)
    plt.tight_layout()
    plt.show()
B. Model Implementation
B.1 Internal Validation Statistics Functions
In [4]:
def pooled_within_ssd(X, y, centroids, dist):
    """
    Compute the pooled within-cluster sum of squared distances,
    where each cluster's contribution is its total squared distance
    to its centroid divided by (2 * cluster size).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data points.
    y : array-like of shape (n_samples,)
        Cluster label of each point.
    centroids : array-like of shape (n_clusters, n_features)
        Cluster centers, ordered to match ``np.unique(y)``.
    dist : str or callable
        Metric passed through to ``scipy.spatial.distance.cdist``.

    Returns
    -------
    float
        The pooled within-cluster SSD.
    """
    points = np.asarray(pd.DataFrame(X))
    labels_arr = np.asarray(y)
    cents = np.asarray(pd.DataFrame(centroids))
    total = 0
    # centroids[i] corresponds to the i-th label in sorted unique order
    for idx, label in enumerate(np.unique(labels_arr)):
        members = points[labels_arr == label]
        center = cents[idx:idx + 1]
        sq_dists = np.square(
            distance.cdist(members, center, metric=dist)
        ) * 1 / (2 * members.shape[0])
        total += sq_dists.sum()
    return total
# Realization Generator
def gen_realizations(X, b, random_state=None):
    """
    Generate ``b`` uniform reference datasets shaped like ``X``.

    Each feature of each realization is drawn uniformly between that
    feature's observed minimum and maximum in ``X``.

    Parameters
    ----------
    X : array-like of shape (n_rows, n_cols)
        Observed data defining the per-feature ranges.
    b : int
        Number of reference realizations to generate.
    random_state : int or None, optional
        Seed for ``np.random.default_rng``.

    Returns
    -------
    np.ndarray of shape (b, n_rows, n_cols)
        The stacked reference datasets.
    """
    lows = X.min(axis=0)
    highs = X.max(axis=0)
    n_rows, n_cols = X.shape
    rng = np.random.default_rng(random_state)
    # Tile the per-feature bounds up to the full output shape.
    low_block = np.tile(lows, (b, n_rows, 1))
    high_block = np.tile(highs, (b, n_rows, 1))
    return rng.uniform(low_block, high_block, size=(b, n_rows, n_cols))
# Gap Statistics
def gap_statistic(X, y, centroids, dist, b, clusterer, random_state=None):
    """
    Compute the gap statistic of a clustering against ``b`` uniform
    reference realizations.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Observed data.
    y : array-like of shape (n_samples,)
        Cluster labels for ``X``.
    centroids : array-like
        Cluster centers for ``X``, matching ``np.unique(y)`` order.
    dist : str or callable
        Metric forwarded to ``pooled_within_ssd``.
    b : int
        Number of reference realizations.
    clusterer : estimator
        Unfitted clusterer with ``fit`` exposing ``labels_`` and
        ``cluster_centers_``; refit on each realization.
    random_state : int or None, optional
        Seed for the reference-data generator.

    Returns
    -------
    list of [float, float]
        Mean gap over the realizations and the (population) standard
        deviation of the per-realization gaps.
    """
    references = gen_realizations(X, b, random_state)
    observed_wk = pooled_within_ssd(X, y, centroids, dist)
    gaps = []
    running_total = 0
    for ref in references:
        fitted = clusterer.fit(ref)
        ref_wk = pooled_within_ssd(ref, fitted.labels_,
                                   fitted.cluster_centers_, dist)
        # gap = log(W_ref) - log(W_observed)
        gap = np.log(ref_wk) - np.log(observed_wk)
        running_total += gap
        gaps.append(gap)
    return [running_total / b, np.std(gaps)]
# Generate Statistics
def cluster_range(X, clusterer, k_start, k_stop):
    """
    Cluster ``X`` for each k in [k_start, k_stop] and collect internal
    validation statistics.

    Fixes: the original fitted each model twice (``fit`` followed by
    ``fit_predict``); one ``fit_predict`` yields the same labels and
    fitted attributes at half the cost. A leftover commented-out
    ``raise NotImplementedError()`` was removed.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    clusterer : estimator
        Template estimator (e.g. KMeans) cloned per k; must expose
        ``n_clusters``, ``cluster_centers_`` and ``inertia_``.
    k_start, k_stop : int
        Inclusive range of cluster counts to evaluate.

    Returns
    -------
    dict
        Keys: 'ys', 'centers', 'inertias', 'chs', 'scs', 'dbs',
        'gss', 'gssds' — one list entry per k.
    """
    ys, centers, inertias = [], [], []
    chs, scs, dbs = [], [], []
    gss, gssds = [], []
    for k in range(k_start, k_stop + 1):
        clusterer_k = clone(clusterer)
        clusterer_k.set_params(n_clusters=k)
        # Single fit: labels, centers and inertia all come from it.
        y = clusterer_k.fit_predict(X)
        ys.append(y)
        centers.append(clusterer_k.cluster_centers_)
        inertias.append(clusterer_k.inertia_)
        chs.append(calinski_harabasz_score(X, y))
        scs.append(silhouette_score(X, y))
        dbs.append(davies_bouldin_score(X, y))
        gap_mean, gap_sd = gap_statistic(
            X,
            y,
            clusterer_k.cluster_centers_,
            euclidean,
            5,  # number of uniform reference realizations
            clone(clusterer).set_params(n_clusters=k),
            random_state=1337,
        )
        gss.append(gap_mean)
        gssds.append(gap_sd)
    return {
        "ys": ys,
        "centers": centers,
        "inertias": inertias,
        "chs": chs,
        "scs": scs,
        "dbs": dbs,
        "gss": gss,
        "gssds": gssds,
    }
# Plot Statistics
def plot_internal(inertias, chs, scs, dbs, gss, gssds, title):
    """
    Plot internal validation statistics against k on shared axes:
    SSE and CH on the left y-axis; gap statistic (with error bars),
    silhouette and DB on a twin right y-axis.

    Returns the left-hand Axes.
    """
    fig, ax_left = plt.subplots()
    # k values start at 2 by convention of cluster_range
    ks = np.arange(2, len(inertias) + 2)
    ax_left.plot(ks, inertias, "-o", label="SSE")
    ax_left.plot(ks, chs, "-ro", label="CH")
    ax_left.set_xlabel("$k$")
    ax_left.set_ylabel("SSE/CH")
    left_lines, left_labels = ax_left.get_legend_handles_labels()
    ax_right = ax_left.twinx()
    ax_right.errorbar(ks, gss, gssds, fmt="-go",
                      label="Gap statistic")
    ax_right.plot(ks, scs, "-ko", label="Silhouette coefficient")
    ax_right.plot(ks, dbs, "-gs", label="DB")
    ax_right.set_ylabel("Gap statistic/Silhouette/DB")
    right_lines, right_labels = ax_right.get_legend_handles_labels()
    # Merge both axes' entries into one legend.
    ax_right.legend(left_lines + right_lines, left_labels + right_labels)
    plt.title(title)
    return ax_left
def axis_plot(res, fig_num):
    """
    Plot each internal validation statistic (SSE, Calinski-Harabasz,
    silhouette, Davies-Bouldin, gap statistic) against k on its own
    axis in a 2x3 grid.

    Fixes: the original built its frame from the notebook global
    ``res_player`` instead of the ``res`` argument, silently ignoring
    its input; it also hard-coded k in [2, 15] (now derived from the
    data length) and misspelled "Silhouette" in the axis header.

    Parameters
    ----------
    res : dict
        Output of ``cluster_range`` (keys: ys, centers, inertias,
        chs, scs, dbs, gss, gssds).
    fig_num : int or str
        Figure number used in the titles.
    """
    player_stats = pd.DataFrame(res)
    # k starts at 2, mirroring cluster_range's usage in this notebook.
    player_stats['k'] = np.arange(2, 2 + len(player_stats))
    player_stats = player_stats.set_index('k')
    figure_letter = ['A', 'B', 'C', 'D', 'E']
    headers = ['SSE', 'Calinski-Harabasz Index',
               'Silhouette Coefficient', 'Davies-Bouldin Index',
               'Gap Statistic']
    # Skip 'ys'/'centers' (first two columns) and 'gssds' (last).
    stat_cols = player_stats.iloc[:, 2:-1].columns
    fig, axs = plt.subplots(2, 3, figsize=(15, 10))
    axs = axs.flatten()
    plt.suptitle(
        f"Figure {fig_num}. Internal Validation Statistics for Kmeans",
        fontsize=20, y=1)
    for i, stat in enumerate(stat_cols):
        axs[i].plot(player_stats.index, player_stats[stat],
                    marker='o')
        axs[i].set_title(f'Figure {fig_num}{figure_letter[i]}. {headers[i]}')
        axs[i].set_xlabel('k')
        axs[i].set_ylabel(stat)
    # Hide any unused axes in the 2x3 grid.
    for ax in axs[len(stat_cols):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()
B.2 Dimensionality Reduction Functions
In [5]:
def plot_ve(variance_explained, title):
    """
    Plot per-component variance explained (left axis) and the
    cumulative curve (right axis), with dashed guide lines at 0.81
    cumulative variance and at PC 5.

    Parameters
    ----------
    variance_explained : np.ndarray
        Variance-explained ratio per principal component.
    title : str
        Figure title.
    """
    n_pcs = len(variance_explained)
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.plot(range(1, n_pcs + 1),
            variance_explained, '-', label='individual')
    ax.set_xlim(0, n_pcs + 1)
    ax.set_xlabel('PCs')
    ax.set_ylabel('variance explained')
    ax_cum = ax.twinx()
    ax_cum.plot(range(1, n_pcs + 1),
                variance_explained.cumsum(), 'r-', label='cumulative')
    # Analysis-specific markers: 0.81 cumulative VE reached at 5 PCs.
    ax_cum.axhline(0.81, ls='--', color='g')
    ax_cum.axvline(5, ls='--', color='g')
    ax_cum.set_ylabel('cumulative variance explained');
B.3 Model Functions
In [6]:
def run_kmeans(X, clusters, title):
    """
    Fit KMeans on ``X``, scatter-plot the clustered points (first two
    feature dimensions) with their centroids, and return the fitted
    model and labels.

    Fix: the original plotted the notebook global ``X_player_new``
    instead of the ``X`` it was fitted on, so passing any other data
    produced a mismatched plot.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster; first two columns are used for plotting.
    clusters : int
        Number of clusters.
    title : str
        Plot title.

    Returns
    -------
    (KMeans, np.ndarray)
        The fitted estimator and the cluster label of each point.
    """
    X_arr = np.asarray(X)
    kmeans_player = KMeans(n_clusters=clusters,
                           random_state=1337, n_init='auto')
    labels = kmeans_player.fit_predict(X_arr)
    # Plot the clusters
    plt.figure(figsize=(10, 6))
    unique_labels = set(labels)
    colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        class_member_mask = (labels == k)
        plt.scatter(X_arr[class_member_mask, 0],
                    X_arr[class_member_mask, 1],
                    color=col, edgecolor='black', s=50,
                    label=f'Cluster {k}')
    # Plot the centroids
    centroids = kmeans_player.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], s=200,
                c='black', alpha=1, label='Centroids',
                marker='X')
    plt.title(title)
    plt.xlabel('PCA Feature 1')
    plt.ylabel('PCA Feature 2')
    plt.legend()
    return kmeans_player, labels
def run_heirarchical(X, method, title):
    """
    Build a hierarchical-clustering linkage matrix on ``X`` with
    fastcluster and draw a dendrogram truncated to the last 20
    merges.

    NOTE(review): the function name keeps the original's spelling
    ("heirarchical") because callers depend on it.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    method : str
        Linkage method (e.g. 'ward', 'complete').
    title : str
        Plot title.
    """
    linkage_matrix = fastcluster.linkage(X, method=method)
    fig, ax = plt.subplots()
    dendrogram(linkage_matrix, ax=ax, p=20,
               truncate_mode='lastp')
    ax.set_ylabel(r"$h$")
    plt.title(title)
def plot_clustered_points(X, c, method, title):
    """
    Cluster ``X`` with agglomerative clustering and scatter-plot the
    result in the first two feature dimensions.

    Fix: the original plotted the notebook global ``X_player_new``
    instead of the ``X`` it clustered, so passing any other data
    produced a mismatched plot.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster; first two columns are used for plotting.
    c : int
        Number of clusters.
    method : str
        Linkage method for AgglomerativeClustering.
    title : str
        Plot title.
    """
    X_arr = np.asarray(X)
    agg = AgglomerativeClustering(
        n_clusters=c, linkage=method, distance_threshold=None
    )
    labels = agg.fit_predict(X_arr)
    # Plot the clusters
    plt.figure(figsize=(10, 6))
    unique_labels = set(labels)
    colors = plt.cm.rainbow(np.linspace(0, 1,
                                        len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        class_member_mask = (labels == k)
        plt.scatter(X_arr[class_member_mask, 0],
                    X_arr[class_member_mask, 1],
                    color=col, edgecolor='black', s=50,
                    label=f'Cluster {k}')
    plt.title(title)
    plt.xlabel('PCA Feature 1')
    plt.ylabel('PCA Feature 2')
    plt.legend()
    plt.show()
def reachibility_plot(X, title):
    """
    Fit OPTICS (min_samples=4, dbscan extraction) on ``X``, draw its
    reachability plot in cluster order, and return the fitted
    estimator.

    NOTE(review): the function name keeps the original's spelling
    ("reachibility") because callers depend on it.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    title : str
        Plot title.

    Returns
    -------
    OPTICS
        The fitted OPTICS estimator.
    """
    optics = OPTICS(min_samples=4, cluster_method="dbscan")
    optics.fit(X)
    # Reachability distances ordered by the OPTICS traversal.
    ordered_reachability = optics.reachability_[optics.ordering_]
    plt.plot(ordered_reachability, ".-")
    plt.title(title)
    plt.ylabel("reachability")
    return optics
def plot_optics_points(optics, thres, title, X=None):
    """
    Extract DBSCAN-style clusters from a fitted OPTICS model at the
    given eps threshold, print summary statistics, and scatter-plot
    the clustered points.

    Generalization: the original depended entirely on the notebook
    global ``X_player_new`` for both the silhouette score and the
    plot. A backward-compatible optional ``X`` parameter now lets
    callers supply the data explicitly; when omitted, the old global
    fallback is preserved.

    Parameters
    ----------
    optics : OPTICS
        A fitted OPTICS estimator.
    thres : float
        eps threshold for cluster extraction.
    title : str
        Plot title.
    X : array-like of shape (n_samples, n_features), optional
        Points that ``optics`` was fitted on; defaults to the
        notebook global ``X_player_new``.
    """
    if X is None:
        X = X_player_new  # legacy behavior: fall back to notebook global
    X_arr = np.asarray(X)
    clusters = cluster_optics_dbscan(
        reachability=optics.reachability_,
        core_distances=optics.core_distances_,
        ordering=optics.ordering_,
        eps=thres,
    )
    print("Number of clusters:", clusters.max() + 1)
    print("Number of noise points:", (clusters == -1).sum())
    print(
        "Number of points in the largest cluster:",
        np.bincount(clusters[clusters >= 0]).max(),
    )
    print("Number of points:", len(clusters))
    print("Silhouette score:",
          silhouette_score(X_arr, clusters))
    # Plot the clusters
    plt.figure(figsize=(10, 6))
    unique_labels = set(clusters)
    colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        class_member_mask = (clusters == k)
        plt.scatter(X_arr[class_member_mask, 0],
                    X_arr[class_member_mask, 1],
                    color=col, edgecolor='black', s=50,
                    label=f'Cluster {k}')
    plt.title(title)
    plt.xlabel('PCA Feature 1')
    plt.ylabel('PCA Feature 2')
    plt.legend()
    plt.show()
def run_kmedoids(X, clusters, title):
    """
    Fit KMedoids on ``X``, scatter-plot the clustered points (first
    two feature dimensions) with their medoids, and return the fitted
    model and labels.

    Fix: the original fitted and plotted the notebook global
    ``X_player_new``, completely ignoring its ``X`` argument.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster; first two columns are used for plotting.
    clusters : int
        Number of clusters.
    title : str
        Plot title.

    Returns
    -------
    (KMedoids, np.ndarray)
        The fitted estimator and the cluster label of each point.
    """
    X_arr = np.asarray(X)
    kmedoids_player = KMedoids(n_clusters=clusters,
                               random_state=1337)
    labels = kmedoids_player.fit_predict(X_arr)
    # Plot the clusters
    plt.figure(figsize=(10, 6))
    unique_labels = set(labels)
    colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        class_member_mask = (labels == k)
        plt.scatter(X_arr[class_member_mask, 0],
                    X_arr[class_member_mask, 1],
                    color=col, edgecolor='black', s=50,
                    label=f'Cluster {k}')
    # Plot the medoids (exposed via cluster_centers_)
    centroids = kmedoids_player.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], s=200,
                c='black', alpha=1, label='Centroids', marker='X')
    plt.title(title)
    plt.xlabel('PCA Feature 1')
    plt.ylabel('PCA Feature 2')
    plt.legend()
    plt.show()
    return kmedoids_player, labels